In [1]:
import preprocess
import visualize
import selection
import relevance
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn import tree
from genetic_selection import GeneticSelectionCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import lime
import lime.lime_tabular
import shap
from lime import submodular_pick
from eli5 import show_weights, show_prediction, explain_weights_dfs, explain_prediction_dfs

import warnings
warnings.filterwarnings("ignore")
In [2]:
# load data for personal internet use disorder
data = preprocess.load_piu_data()
data.head()
Out[2]:
Gender Achievement Economic status Internet Use (in years) Internet Use (hours per week) Internet Use (hours per day) Internet Use (in holiday) Attitude about time on the Internet Politics Business ... Coffee Alcohol Drepressive temperament Cyclothymic temperament Hyperthymic temperament Irritable temperament Anxiety temperament PIU PIUcutoff Cutoff_Class
0 0.0 3.0 3.0 9.0 1.0 1.0 2.0 0.0 5.0 1.0 ... 1.0 1.0 0.857143 0.571429 0.500 0.571429 0.714286 NaN NaN NaN
1 0.0 4.0 4.0 5.0 0.0 0.0 2.0 0.0 2.0 1.0 ... 0.0 0.0 0.285714 0.857143 0.875 0.571429 0.571429 25.0 0.0 0.0
2 0.0 4.0 3.0 9.0 1.0 1.0 1.0 0.0 2.0 3.0 ... 0.0 0.0 0.142857 0.142857 1.000 0.142857 0.000000 28.0 0.0 0.0
3 0.0 4.0 3.0 7.0 0.0 0.0 1.0 0.0 1.0 2.0 ... 0.0 0.0 0.285714 0.142857 1.000 0.714286 0.428571 31.0 0.0 0.0
4 0.0 4.0 1.0 2.0 0.0 0.0 2.0 0.0 1.0 3.0 ... NaN 0.0 0.428571 0.428571 0.625 0.857143 0.714286 28.0 0.0 0.0

5 rows × 59 columns

In [3]:
# print column names to check if everything is OK
print(data.columns)
Index(['Gender', 'Achievement', 'Economic status', 'Internet Use (in years)',
       'Internet Use (hours per week)', 'Internet Use (hours per day)',
       'Internet Use (in holiday)', 'Attitude about time on the Internet',
       'Politics', 'Business', 'Sports', 'Computers and technology',
       'Arts and culture', 'Education', 'Pop culture', 'Pornography', 'Music',
       'Travel/tourism', 'Health and medicine', 'Science', 'Religion',
       'Communication by e-mail', 'Social networks',
       'Communication on the forum', 'Communication on the blog',
       'Targeted Internet search', 'Surfing', 'Expert Advice',
       'Search for favorite websites', 'Reading the news', 'Online games',
       'Reading and downloading books and texts',
       'Downloading music and movies', 'Internet for school', 'Online courses',
       'Everyday FB use', 'Average time spent on FB', 'FB use - reading posts',
       'FB use - publishing statuses', 'FB use - sharing music, photos etc.',
       'FB use –gaming', 'FB use – chatting', 'FB use – visiting groups',
       'Sports – days in a  week', 'Sports – intensity', 'Sports – in minutes',
       'Energy drinks', 'Fast Food', 'Smoker', 'Coffee', 'Alcohol',
       'Drepressive temperament', 'Cyclothymic temperament',
       'Hyperthymic temperament', 'Irritable temperament',
       'Anxiety temperament', 'PIU', 'PIUcutoff', 'Cutoff_Class'],
      dtype='object')
In [4]:
# check columns with NaN values
print(data.isna().sum().sort_values(ascending=False))
Cutoff_Class                               104
PIUcutoff                                  104
PIU                                        104
Average time spent on FB                    95
Internet Use (in years)                     91
Sports – in minutes                         86
Internet Use (hours per day)                72
Communication by e-mail                     60
Communication on the blog                   59
Sports – intensity                          55
Online games                                54
Online courses                              54
Pornography                                 53
Communication on the forum                  52
Expert Advice                               49
Reading and downloading books and texts     48
Fast Food                                   46
Religion                                    45
Travel/tourism                              43
Pop culture                                 43
Business                                    43
Computers and technology                    42
Energy drinks                               41
Health and medicine                         40
Education                                   40
Arts and culture                            40
Downloading music and movies                39
Science                                     39
Search for favorite websites                39
Internet for school                         37
Targeted Internet search                    35
Sports – days in a  week                    34
Reading the news                            34
Internet Use (hours per week)               34
Coffee                                      33
Politics                                    33
Music                                       32
Sports                                      30
Everyday FB use                             29
Attitude about time on the Internet         27
Surfing                                     24
Economic status                             23
Internet Use (in holiday)                   23
Smoker                                      22
Alcohol                                     21
Social networks                             20
Achievement                                 10
Gender                                       2
FB use – visiting groups                     0
FB use – chatting                            0
FB use –gaming                               0
FB use - sharing music, photos etc.          0
Drepressive temperament                      0
Cyclothymic temperament                      0
Hyperthymic temperament                      0
Irritable temperament                        0
Anxiety temperament                          0
FB use - publishing statuses                 0
FB use - reading posts                       0
dtype: int64
In [5]:
data = preprocess.process_standardization(data)
data.head
Out[5]:
<bound method NDFrame.head of       Gender  Achievement  Economic status  Internet Use (in years)  \
0        0.0    -1.366523        -0.389762                 1.528074   
1        0.0    -0.145708         0.564106                -0.237632   
2        0.0    -0.145708        -0.389762                 1.528074   
3        0.0    -0.145708        -0.389762                 0.645221   
4        0.0    -0.145708        -2.297499                -1.561912   
...      ...          ...              ...                      ...   
2108     0.0    -0.145708        -2.297499                -1.561912   
2109     0.0     1.075106        -0.389762                -0.237632   
2110     0.0    -1.366523        -0.389762                -0.237632   
2111     0.0    -0.145708        -2.297499                -1.120486   
2112     0.0    -1.366523         0.564106                -1.120486   

      Internet Use (hours per week)  Internet Use (hours per day)  \
0                               1.0                      0.646391   
1                               0.0                     -0.703951   
2                               1.0                      0.646391   
3                               0.0                     -0.703951   
4                               0.0                     -0.703951   
...                             ...                           ...   
2108                            1.0                      0.646391   
2109                            1.0                     -0.703951   
2110                            1.0                     -0.703951   
2111                            1.0                     -0.703951   
2112                            1.0                     -0.703951   

      Internet Use (in holiday)  Attitude about time on the Internet  \
0                      1.272123                                  0.0   
1                      1.272123                                  0.0   
2                     -0.141347                                  0.0   
3                     -0.141347                                  0.0   
4                      1.272123                                  0.0   
...                         ...                                  ...   
2108                   1.272123                                  0.0   
2109                   1.272123                                  0.0   
2110                  -0.141347                                  0.0   
2111                  -0.141347                                  1.0   
2112                   1.272123                                  0.0   

      Politics  Business  ...  Coffee  Alcohol  Drepressive temperament  \
0     3.554807 -0.685844  ...     1.0      1.0                 2.595336   
1     0.367281 -0.685844  ...     0.0      0.0                 0.217216   
2     0.367281  1.335077  ...     0.0      0.0                -0.377314   
3    -0.695228  0.324616  ...     0.0      0.0                 0.217216   
4    -0.695228  1.335077  ...     NaN      0.0                 0.811746   
...        ...       ...  ...     ...      ...                      ...   
2108 -0.695228 -0.685844  ...     0.0      1.0                 2.000806   
2109 -0.695228 -0.685844  ...     0.0      0.0                -0.377314   
2110 -0.695228 -0.685844  ...     0.0      1.0                -0.971844   
2111  0.367281  3.355997  ...     0.0      1.0                 1.406276   
2112  0.367281 -0.685844  ...     0.0      1.0                -0.971844   

      Cyclothymic temperament  Hyperthymic temperament  Irritable temperament  \
0                    0.186837                -1.031386               0.273212   
1                    1.193945                 0.567918               0.273212   
2                   -1.323826                 1.101019              -1.171234   
3                   -1.323826                 1.101019               0.754694   
4                   -0.316717                -0.498284               1.236176   
...                       ...                      ...                    ...   
2108                -0.316717                 1.101019               0.754694   
2109                -1.323826                 0.567918              -1.171234   
2110                -1.827380                 1.101019              -0.689752   
2111                 0.690391                 1.101019               1.236176   
2112                -0.820272                 0.034817              -0.208270   

      Anxiety temperament   PIU  PIUcutoff  Cutoff_Class  
0                1.139434   NaN        NaN           NaN  
1                0.641477  25.0        0.0           0.0  
2               -1.350353  28.0        0.0           0.0  
3                0.143519  31.0        0.0           0.0  
4                1.139434  28.0        0.0           0.0  
...                   ...   ...        ...           ...  
2108            -0.354438  46.0        1.0           1.0  
2109            -0.852396  30.0        0.0           0.0  
2110            -1.350353  39.0        1.0           0.0  
2111             0.143519  39.0        1.0           0.0  
2112            -1.350353  18.0        0.0           0.0  

[2113 rows x 59 columns]>
In [6]:
# process columns with NaN values
data = preprocess.process_columns_with_nan_values(data)
# check again columns with NaN values
print(data.isna().sum().sort_values(ascending=False))
Gender                                     0
Sports – intensity                         0
Downloading music and movies               0
Internet for school                        0
Online courses                             0
Everyday FB use                            0
Average time spent on FB                   0
FB use - reading posts                     0
FB use - publishing statuses               0
FB use - sharing music, photos etc.        0
FB use –gaming                             0
FB use – chatting                          0
FB use – visiting groups                   0
Sports – days in a  week                   0
Sports – in minutes                        0
Online games                               0
Energy drinks                              0
Fast Food                                  0
Smoker                                     0
Coffee                                     0
Alcohol                                    0
Drepressive temperament                    0
Cyclothymic temperament                    0
Hyperthymic temperament                    0
Irritable temperament                      0
Anxiety temperament                        0
PIU                                        0
PIUcutoff                                  0
Reading and downloading books and texts    0
Reading the news                           0
Achievement                                0
Pop culture                                0
Economic status                            0
Internet Use (in years)                    0
Internet Use (hours per week)              0
Internet Use (hours per day)               0
Internet Use (in holiday)                  0
Attitude about time on the Internet        0
Politics                                   0
Business                                   0
Sports                                     0
Computers and technology                   0
Arts and culture                           0
Education                                  0
Pornography                                0
Search for favorite websites               0
Music                                      0
Travel/tourism                             0
Health and medicine                        0
Science                                    0
Religion                                   0
Communication by e-mail                    0
Social networks                            0
Communication on the forum                 0
Communication on the blog                  0
Targeted Internet search                   0
Surfing                                    0
Expert Advice                              0
Cutoff_Class                               0
dtype: int64
In [7]:
data = preprocess.process_outliers(data)
Original number of rows: 2009
Number of normal rows detected: 1908
Number of outliers detected: 101
Number of rows after eliminating outliers: 1908
In [8]:
# predictors and class
x_data = data[data.columns.difference(['Cutoff_Class', 'PIUcutoff', 'PIU' ])]
y_data = data['Cutoff_Class']
# split dataset to training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.05, random_state=1234, shuffle=True) #, stratify=y_data)
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
In [9]:
# prepare data frame to count top features
df_top_features = pd.DataFrame({'attr_names' : x_train.columns.values})
df_top_features['top_count'] = 0
df_top_features['top_count_xai'] = 0
df_top_features['top_count_selection'] = 0
In [10]:
df = pd.DataFrame({'Cutoff_Class':y_train})
visualize.show_value_distribution_per_column(df, 'Cutoff_Class')
No description has been provided for this image
In [11]:
# oversample data
oversampler = SMOTE()
x_train, y_train = oversampler.fit_resample(x_train, y_train)
df = pd.DataFrame({'Cutoff_Class':y_train})
visualize.show_value_distribution_per_column(df, 'Cutoff_Class')
No description has been provided for this image
In [12]:
k_features = selection.select_k_best_features(x_train, y_train, 0)
visualize.show_selected_features(
    title = 'Feature selection using univariate statistical test (ANOVA F-value)',
    label_x = 'Feature score',
    label_y = 'Feature name',
    data_x = k_features['values'],
    data_y = k_features['attr_names']
)
df_top_features = relevance.add_top_features_count('select_k_best', df_top_features, k_features, 10)
No description has been provided for this image
In [13]:
trained_models = dict()
parameters = dict()
# hyperparameters for decision tree classifier
parameters['decision_tree'] = dict(
    max_leaf_nodes = list(range(2,10)), 
    min_samples_split = [2,3,4],
    max_depth = [2,3,5,10],
    criterion = ['gini', 'entropy']
)
trained_models['decision_tree'] = DecisionTreeClassifier()

# hyperparameters for random forest classifier
parameters['random_forest'] = dict(
    n_estimators = [10,20,40], 
    criterion = ['gini', 'entropy', 'log_loss'],
    # max_features = list(range(3,8)),
    max_depth = [2,4,9]
)
trained_models['random_forest'] = RandomForestClassifier()

# hyperparameters for adaboost classifier
parameters['adaboost'] = dict(
    n_estimators = [10,20,40],
    learning_rate = [0.01,0.1,1],
    algorithm = ['SAMME', 'SAMME.R']
)
trained_models['adaboost'] = AdaBoostClassifier()

# hyperparameters for xgboost classifier
parameters['xgboost'] = dict(
    max_depth = [1,3,5,7,9,11],
    learning_rate = [0.01,0.1,1,10,100],
    subsample = [0.5, 0.7, 1],
    n_estimators = [5,50,70]
)
trained_models['xgboost'] = XGBClassifier()

#hyperparameters for bagging classifier
parameters['bagging'] = dict(
    #n_estimators = [300, 400, 500, 600, 700, 800],
    n_estimators = [300, 600],
    # max_features = [0.90, 0.92, 0.95, 1.0],
    bootstrap = [True, False],
    bootstrap_features = [True, False],
)
trained_models['bagging'] = BaggingClassifier()

# hyperparameters for gradient boosting classifier
parameters['gradient_boosting'] = dict(
    n_estimators = [5,50,70],
    max_depth = [1,3,5,7,9,11],
    learning_rate = [0.01,0.1,1,10,100],
    #loss = ['log_loss', 'exponential'],
    #criterion = ['friedman_mse', 'squared_error']
)
trained_models['gradient_boosting'] = GradientBoostingClassifier()

# train classifier using GridSearchCV for each of the selected models
# print("Not optimizing parameters every time")
# uncomment only to recheck hyperparameter optimization
# for m in trained_models:
#     trained_models[m] = selection.train_best_classifier(m, trained_models[m], parameters[m], x_train, y_train, x_test, y_test)

# instead create models with optimized parameters
trained_models = dict()
trained_models['decision_tree'] = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=7)
trained_models['random_forest'] = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=40)
trained_models['adaboost'] = AdaBoostClassifier(learning_rate=1, n_estimators=40)
trained_models['xgboost'] = XGBClassifier(max_depth=9, learning_rate=0.1, n_estimators=50)
trained_models['bagging'] = BaggingClassifier(bootstrap_features=False, bootstrap=True, n_estimators=600)
trained_models['gradient_boost'] = GradientBoostingClassifier(learning_rate=1, max_depth=11, n_estimators=70)
In [14]:
# compute classification metrics for every classification model
df_models = selection.calculate_classifier_metrics(trained_models, x_train, y_train)
print(df_models)
       Model name  Accuracy  Precision  Recall     F1
0   decision_tree     0.722      0.782   0.620  0.690
1   random_forest     0.782      0.800   0.752  0.774
2        adaboost     0.771      0.767   0.779  0.773
3         xgboost     0.794      0.802   0.781  0.790
4         bagging     0.808      0.826   0.781  0.802
5  gradient_boost     0.800      0.814   0.781  0.796
In [15]:
# show metrics for trained models
visualize.show_model_comparasion(df_models)
No description has been provided for this image
In [16]:
# fit all models with train data
trained_models = dict()
trained_models['decision_tree'] = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=7)
trained_models['random_forest'] = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=40)
trained_models['adaboost'] = AdaBoostClassifier(learning_rate=1, n_estimators=40)
trained_models['xgboost'] = XGBClassifier(max_depth=9, learning_rate=0.1, n_estimators=50)
for m in trained_models:
    trained_models[m].fit(x_train, y_train)
    
df_features = selection.get_feature_importances(trained_models, x_train, y_train)
visualize.show_selected_features(
    title = 'Feature selection - Tree models',
    label_x = 'Feature score',
    label_y = 'Feature name',
    data_x = df_features['values'],
    data_y = df_features['attr_names']
)

df_top_features = relevance.add_top_features_count('tree_importance', df_top_features, df_features, 10)
No description has been provided for this image
In [17]:
# show features selected by LassoCV
df_features = selection.get_feature_scores_lasso(x_train, y_train)
visualize.show_selected_features(
    title = 'Feature selection - LassoCV',
    label_x = 'Feature score',
    label_y = 'Feature name',
    data_x = df_features['values'],
    data_y = df_features['attr_names']
)
df_top_features = relevance.add_top_features_count('lasso', df_top_features, df_features, 10)
No description has been provided for this image
In [18]:
# show features selected by ElasticNET
df_features = selection.get_feature_scores_elenet(x_train, y_train)
visualize.show_selected_features(
    title = 'Feature selection - ElasticNET',
    label_x = 'Feature score',
    label_y = 'Feature name',
    data_x = df_features['values'],
    data_y = df_features['attr_names']
)
df_top_features = relevance.add_top_features_count('elenet', df_top_features, df_features, 10)
No description has been provided for this image
In [19]:
# fit one representative model
predictions = relevance.get_predictions('random_forest', x_train, y_train, x_test, y_test)
Train accuracy: 0.97
Test accuracy: 0.74
Confusion Matrix: 
[[45 10]
 [15 26]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.75      0.82      0.78        55
         1.0       0.72      0.63      0.68        41

    accuracy                           0.74        96
   macro avg       0.74      0.73      0.73        96
weighted avg       0.74      0.74      0.74        96

In [20]:
# feature selection using genetic algorithms
selector = GeneticSelectionCV(
    estimator=predictions['model'],
    cv=10,
    scoring="accuracy",
    n_population=20, #50,
    n_generations=5, #40,
    n_jobs=10,
    verbose=False
)
selector.fit(x_train, y_train)
visualize.show_genetic_selection_results(selector, x_train)
                                   features  is_used
0                               Achievement     True
1                                   Alcohol     True
2                       Anxiety temperament     True
3                          Arts and culture    False
4       Attitude about time on the Internet     True
5                  Average time spent on FB     True
6                                  Business     True
7                                    Coffee     True
8                   Communication by e-mail     True
9                 Communication on the blog     True
10               Communication on the forum     True
11                 Computers and technology    False
12                  Cyclothymic temperament     True
13             Downloading music and movies     True
14                  Drepressive temperament     True
15                          Economic status     True
16                                Education     True
17                            Energy drinks     True
18                          Everyday FB use     True
19                            Expert Advice     True
20             FB use - publishing statuses     True
21                   FB use - reading posts     True
22      FB use - sharing music, photos etc.     True
23                        FB use – chatting     True
24                 FB use – visiting groups     True
25                           FB use –gaming     True
26                                Fast Food     True
27                                   Gender     True
28                      Health and medicine     True
29                  Hyperthymic temperament     True
30             Internet Use (hours per day)    False
31            Internet Use (hours per week)     True
32                Internet Use (in holiday)     True
33                  Internet Use (in years)     True
34                      Internet for school    False
35                    Irritable temperament     True
36                                    Music     True
37                           Online courses     True
38                             Online games     True
39                                 Politics     True
40                              Pop culture     True
41                              Pornography    False
42  Reading and downloading books and texts     True
43                         Reading the news     True
44                                 Religion     True
45                                  Science     True
46             Search for favorite websites     True
47                                   Smoker    False
48                          Social networks     True
49                                   Sports    False
50                 Sports – days in a  week     True
51                      Sports – in minutes     True
52                       Sports – intensity     True
53                                  Surfing     True
54                 Targeted Internet search     True
55                           Travel/tourism    False
In [21]:
# add top features statistics
df_features = pd.DataFrame({'attr_names': x_train.columns.values, 'values':selector.support_})
df_top_features = relevance.add_top_features_count('genetic_selection', df_top_features, df_features, 10)
In [22]:
# fit one representative model and get it's predictions
predictions = relevance.get_predictions('random_forest', x_train, y_train, x_test, y_test)
Train accuracy: 0.97
Test accuracy: 0.74
Confusion Matrix: 
[[46  9]
 [16 25]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.74      0.84      0.79        55
         1.0       0.74      0.61      0.67        41

    accuracy                           0.74        96
   macro avg       0.74      0.72      0.73        96
weighted avg       0.74      0.74      0.74        96

In [23]:
# use LIME to explain results for a single prediction
explainer = lime.lime_tabular.LimeTabularExplainer(
    x_train.values, 
    feature_names=x_test.columns.values.tolist(),
    class_names=['PIU no', 'PIU yes'], 
    verbose=False, 
    mode='classification'
)

predict_fn = lambda x: predictions['model'].predict_proba(x)

print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
    x_test.loc[i, x_test.columns.values.tolist()].astype(int).values, 
    predict_fn, 
    num_features=10
)
map = exp.as_map()
print(map)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])

print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
    x_test.loc[i, x_test.columns.values.tolist()].astype(int).values, 
    predict_fn, 
    num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])

print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
    x_test.loc[i, x_test.columns.values.tolist()].astype(int).values, 
    predict_fn, 
    num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])

print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))
exp = explainer.explain_instance(
    x_test.loc[i, x_test.columns.values.tolist()].astype(int).values, 
    predict_fn, 
    num_features=10
)
exp.show_in_notebook(show_table=True)
figure = exp.as_pyplot_figure(label = exp.available_labels()[0])
True positive predictions.

Expected PIU value: 1.0.

Predicted PIU value: 1.0.

{1: [(31, 0.11128951703580371), (4, -0.07341759951466524), (3, -0.05341176289759748), (5, 0.05169847495873944), (48, 0.046315630954042925), (12, 0.03946902747313003), (17, 0.03157528748484642), (46, -0.027174525064780956), (21, -0.024469859196085277), (1, -0.02371051659455407)]}
True negative predictions.

Expected PIU value: 0.0.

Predicted PIU value: 0.0.

False positive predictions.

Expected PIU value: 0.0.

Predicted PIU value: 1.0.

False negative predictions.

Expected PIU value: 1.0.

Predicted PIU value: 0.0.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [24]:
# Let's use SP-LIME to return explanations on a sample data set 
# and obtain a non-redundant global decision perspective of the black-box model
sp_exp = submodular_pick.SubmodularPick(explainer, 
                                        x_test[x_test.columns.values.tolist()].values,
                                        predict_fn, 
                                        num_features=5,
                                        num_exps_desired=5
                                       )
[exp.show_in_notebook() for exp in sp_exp.sp_explanations]
print('SP-LIME Explanations.')
[exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_exp.sp_explanations]
print('SP-LIME Local Explanations')
SP-LIME Explanations.
SP-LIME Local Explanations
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [25]:
# creating SHAP explainer
explainer = shap.TreeExplainer(predictions['model'])
shap_values = explainer.shap_values(x_train)
# show summary plot of global features relevance 
shap.initjs()
shap.summary_plot(shap_values, x_train)
No description has been provided for this image
No description has been provided for this image
In [26]:
# add top features statistics
df = relevance.get_shap_values_as_data_frame(x_train, shap_values)
print(df)
df_top_features = relevance.add_top_features_count('SHAP', df_top_features, k_features, 10)
df = relevance.get_shap_values_as_data_frame(x_train, shap_values, 0)
print(df)
df = relevance.get_shap_values_as_data_frame(x_train, shap_values, 1)
print(df)
                                 attr_names    values
1                  Average time spent on FB  0.118057
2             Internet Use (hours per week)  0.113004
3                   Cyclothymic temperament  0.102898
4                          Arts and culture  0.067168
5                     Irritable temperament  0.051360
6              Internet Use (hours per day)  0.049661
7       Attitude about time on the Internet  0.049350
8                                   Surfing  0.038018
9                              Online games  0.037523
10                          Social networks  0.035075
11                              Achievement  0.030403
12             Search for favorite websites  0.028973
13                      Anxiety temperament  0.028000
14                          Everyday FB use  0.025982
15                              Pornography  0.025213
16                            Energy drinks  0.024699
17                  Drepressive temperament  0.022546
18                 Computers and technology  0.019394
19                  Hyperthymic temperament  0.016688
20                            Expert Advice  0.016658
21                Internet Use (in holiday)  0.015385
22               Communication on the forum  0.012028
23                 Sports – days in a  week  0.011033
24                                  Alcohol  0.010773
25                  Internet Use (in years)  0.010741
26                      Health and medicine  0.010317
27                                Education  0.010009
28                Communication on the blog  0.009742
29                                   Gender  0.009558
30                         Reading the news  0.009292
31  Reading and downloading books and texts  0.009183
32                        FB use – chatting  0.008665
33                       Sports – intensity  0.008551
34                           Online courses  0.008364
35                   FB use - reading posts  0.008040
36                              Pop culture  0.007248
37                      Internet for school  0.006577
38                      Sports – in minutes  0.006416
39                          Economic status  0.006410
40                 Targeted Internet search  0.006394
41                                 Business  0.006302
42             Downloading music and movies  0.005368
43                           Travel/tourism  0.005223
44                                 Religion  0.005200
45                                   Sports  0.004869
46                                  Science  0.004594
47                  Communication by e-mail  0.004344
48                                 Politics  0.004127
49                                   Coffee  0.004120
50                 FB use – visiting groups  0.003894
51      FB use - sharing music, photos etc.  0.003178
52             FB use - publishing statuses  0.002496
53                                Fast Food  0.002167
54                                    Music  0.001923
55                                   Smoker  0.001714
56                           FB use –gaming  0.001160
                                 attr_names    values
1                  Average time spent on FB  0.059029
2             Internet Use (hours per week)  0.056502
3                   Cyclothymic temperament  0.051449
4                          Arts and culture  0.033584
5                     Irritable temperament  0.025680
6              Internet Use (hours per day)  0.024831
7       Attitude about time on the Internet  0.024675
8                                   Surfing  0.019009
9                              Online games  0.018762
10                          Social networks  0.017538
11                              Achievement  0.015201
12             Search for favorite websites  0.014486
13                      Anxiety temperament  0.014000
14                          Everyday FB use  0.012991
15                              Pornography  0.012606
16                            Energy drinks  0.012349
17                  Drepressive temperament  0.011273
18                 Computers and technology  0.009697
19                  Hyperthymic temperament  0.008344
20                            Expert Advice  0.008329
21                Internet Use (in holiday)  0.007692
22               Communication on the forum  0.006014
23                 Sports – days in a  week  0.005517
24                                  Alcohol  0.005386
25                  Internet Use (in years)  0.005371
26                      Health and medicine  0.005158
27                                Education  0.005004
28                Communication on the blog  0.004871
29                                   Gender  0.004779
30                         Reading the news  0.004646
31  Reading and downloading books and texts  0.004592
32                        FB use – chatting  0.004332
33                       Sports – intensity  0.004276
34                           Online courses  0.004182
35                   FB use - reading posts  0.004020
36                              Pop culture  0.003624
37                      Internet for school  0.003288
38                      Sports – in minutes  0.003208
39                          Economic status  0.003205
40                 Targeted Internet search  0.003197
41                                 Business  0.003151
42             Downloading music and movies  0.002684
43                           Travel/tourism  0.002612
44                                 Religion  0.002600
45                                   Sports  0.002434
46                                  Science  0.002297
47                  Communication by e-mail  0.002172
48                                 Politics  0.002063
49                                   Coffee  0.002060
50                 FB use – visiting groups  0.001947
51      FB use - sharing music, photos etc.  0.001589
52             FB use - publishing statuses  0.001248
53                                Fast Food  0.001083
54                                    Music  0.000961
55                                   Smoker  0.000857
56                           FB use –gaming  0.000580
                                 attr_names    values
1                  Average time spent on FB  0.059029
2             Internet Use (hours per week)  0.056502
3                   Cyclothymic temperament  0.051449
4                          Arts and culture  0.033584
5                     Irritable temperament  0.025680
6              Internet Use (hours per day)  0.024831
7       Attitude about time on the Internet  0.024675
8                                   Surfing  0.019009
9                              Online games  0.018762
10                          Social networks  0.017538
11                              Achievement  0.015201
12             Search for favorite websites  0.014486
13                      Anxiety temperament  0.014000
14                          Everyday FB use  0.012991
15                              Pornography  0.012606
16                            Energy drinks  0.012349
17                  Drepressive temperament  0.011273
18                 Computers and technology  0.009697
19                  Hyperthymic temperament  0.008344
20                            Expert Advice  0.008329
21                Internet Use (in holiday)  0.007692
22               Communication on the forum  0.006014
23                 Sports – days in a  week  0.005517
24                                  Alcohol  0.005386
25                  Internet Use (in years)  0.005371
26                      Health and medicine  0.005158
27                                Education  0.005004
28                Communication on the blog  0.004871
29                                   Gender  0.004779
30                         Reading the news  0.004646
31  Reading and downloading books and texts  0.004592
32                        FB use – chatting  0.004332
33                       Sports – intensity  0.004276
34                           Online courses  0.004182
35                   FB use - reading posts  0.004020
36                              Pop culture  0.003624
37                      Internet for school  0.003288
38                      Sports – in minutes  0.003208
39                          Economic status  0.003205
40                 Targeted Internet search  0.003197
41                                 Business  0.003151
42             Downloading music and movies  0.002684
43                           Travel/tourism  0.002612
44                                 Religion  0.002600
45                                   Sports  0.002434
46                                  Science  0.002297
47                  Communication by e-mail  0.002172
48                                 Politics  0.002063
49                                   Coffee  0.002060
50                 FB use – visiting groups  0.001947
51      FB use - sharing music, photos etc.  0.001589
52             FB use - publishing statuses  0.001248
53                                Fast Food  0.001083
54                                    Music  0.000961
55                                   Smoker  0.000857
56                           FB use –gaming  0.000580
In [27]:
# plot explanations for single instance predictions
print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], x_test.iloc[i,:])
True positive predictions.

Expected PIU value: 1.0.

Predicted PIU value: 1.0.

No description has been provided for this image
Out[27]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [28]:
print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[i,:])
True negative predictions.

Expected PIU value: 0.0.

Predicted PIU value: 0.0.

No description has been provided for this image
Out[28]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [29]:
print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[i,:])
False positive predictions.

Expected PIU value: 0.0.

Predicted PIU value: 1.0.

No description has been provided for this image
Out[29]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [30]:
print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], x_test.iloc[i,:])
False negative predictions.

Expected PIU value: 1.0.

Predicted PIU value: 0.0.

No description has been provided for this image
Out[30]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [31]:
shap.initjs()
shap.decision_plot(explainer.expected_value[1], shap_values[1][0,:], x_train.iloc[3,:])
No description has been provided for this image
No description has been provided for this image
In [32]:
# use eli5 (permutation feature importance) to show global feature relevance
show_weights(
    predictions['model'], 
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_train.columns.values.tolist(),
    top=len(x_train.columns)+1
)
Out[32]:
Weight Feature
0.0774 ± 0.0903 Average time spent on FB
0.0651 ± 0.0584 Cyclothymic temperament
0.0578 ± 0.0813 Internet Use (hours per week)
0.0366 ± 0.0359 Irritable temperament
0.0344 ± 0.0477 Arts and culture
0.0281 ± 0.0452 Online games
0.0280 ± 0.0537 Attitude about time on the Internet
0.0274 ± 0.0299 Anxiety temperament
0.0272 ± 0.0305 Surfing
0.0252 ± 0.0425 Internet Use (hours per day)
0.0227 ± 0.0476 Social networks
0.0220 ± 0.0380 Search for favorite websites
0.0217 ± 0.0351 Drepressive temperament
0.0205 ± 0.0188 Sports – days in a week
0.0203 ± 0.0231 Pornography
0.0203 ± 0.0205 Internet Use (in years)
0.0198 ± 0.0218 Hyperthymic temperament
0.0191 ± 0.0184 Reading the news
0.0188 ± 0.0287 Achievement
0.0177 ± 0.0197 Sports – in minutes
0.0175 ± 0.0247 Computers and technology
0.0170 ± 0.0527 Energy drinks
0.0163 ± 0.0289 Everyday FB use
0.0163 ± 0.0197 Communication on the blog
0.0159 ± 0.0209 Reading and downloading books and texts
0.0154 ± 0.0229 Health and medicine
0.0153 ± 0.0155 Economic status
0.0151 ± 0.0209 Internet Use (in holiday)
0.0145 ± 0.0168 Pop culture
0.0135 ± 0.0165 Internet for school
0.0134 ± 0.0168 Education
0.0133 ± 0.0197 Expert Advice
0.0124 ± 0.0163 Communication by e-mail
0.0123 ± 0.0154 Religion
0.0122 ± 0.0159 Science
0.0121 ± 0.0191 Travel/tourism
0.0120 ± 0.0152 Downloading music and movies
0.0119 ± 0.0135 Sports – intensity
0.0118 ± 0.0223 Communication on the forum
0.0115 ± 0.0173 Sports
0.0112 ± 0.0187 Business
0.0107 ± 0.0217 FB use - reading posts
0.0105 ± 0.0145 Online courses
0.0099 ± 0.0142 Targeted Internet search
0.0093 ± 0.0145 Politics
0.0086 ± 0.0124 Alcohol
0.0076 ± 0.0216 FB use – chatting
0.0070 ± 0.0111 Gender
0.0064 ± 0.0106 Coffee
0.0062 ± 0.0110 FB use – visiting groups
0.0048 ± 0.0098 FB use - sharing music, photos etc.
0.0047 ± 0.0101 FB use - publishing statuses
0.0040 ± 0.0095 Music
0.0036 ± 0.0071 Fast Food
0.0030 ± 0.0064 Smoker
0.0024 ± 0.0056 FB use –gaming
In [33]:
# add top features statistics
df = explain_weights_dfs(
    predictions['model'], 
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_train.columns.values.tolist(),
    top=len(x_train.columns)+1
)
df = pd.DataFrame({'attr_names' : df['feature_importances'].iloc[:, 0], 'values' : df['feature_importances'].iloc[:, 1]})
df_top_features = relevance.add_top_features_count('eli5', df_top_features, k_features, 10)
In [34]:
# plot explanations for single instance predictions
print('True positive predictions.\n')
i = predictions['tp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

show_prediction(
    predictions['model'],
    x_test.iloc[i,:],
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_test.columns.values.tolist(),
    top=(10, 10)
)
True positive predictions.

Expected PIU value: 1.0.

Predicted PIU value: 1.0.

Out[34]:

y=PIU yes (probability 0.735) top features

Contribution? Feature
+0.502 <BIAS>
+0.081 Internet Use (hours per week)
+0.072 Average time spent on FB
+0.026 Internet Use (hours per day)
+0.022 Health and medicine
+0.018 Surfing
+0.017 Search for favorite websites
+0.016 Anxiety temperament
+0.015 Pornography
+0.015 Social networks
… 23 more positive …
… 9 more negative …
-0.005 FB use – visiting groups
-0.006 Alcohol
-0.007 Sports
-0.008 Gender
-0.014 Irritable temperament
-0.014 Drepressive temperament
-0.016 Religion
-0.016 Business
-0.024 Attitude about time on the Internet
-0.033 Arts and culture
In [35]:
print('True negative predictions.\n')
i = predictions['tn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

show_prediction(
    predictions['model'],
    x_test.iloc[i,:],
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_test.columns.values.tolist(),
    top=len(x_test.columns)+1
)
True negative predictions.

Expected PIU value: 0.0.

Predicted PIU value: 0.0.

Out[35]:

y=PIU no (probability 0.795) top features

Contribution? Feature
+0.498 <BIAS>
+0.068 Cyclothymic temperament
+0.050 Average time spent on FB
+0.039 Social networks
+0.028 Internet Use (hours per week)
+0.024 Online games
+0.024 Internet Use (hours per day)
+0.020 Arts and culture
+0.020 Achievement
+0.018 Energy drinks
+0.016 Gender
+0.016 Drepressive temperament
+0.013 Internet Use (in years)
+0.011 Computers and technology
+0.010 Search for favorite websites
+0.009 Pornography
+0.008 Attitude about time on the Internet
+0.008 Communication by e-mail
+0.007 Religion
+0.007 Communication on the blog
+0.004 Expert Advice
+0.004 Health and medicine
+0.004 Education
+0.004 FB use – chatting
+0.003 Sports
+0.002 FB use - sharing music, photos etc.
+0.002 Downloading music and movies
+0.001 Sports – in minutes
+0.001 FB use - reading posts
+0.001 Anxiety temperament
+0.001 Economic status
+0.000 Politics
+0.000 FB use –gaming
+0.000 Science
-0.000 Music
-0.000 Business
-0.001 Surfing
-0.001 Internet for school
-0.001 Reading and downloading books and texts
-0.001 Travel/tourism
-0.001 Online courses
-0.002 Alcohol
-0.002 Coffee
-0.003 Communication on the forum
-0.004 Pop culture
-0.005 Targeted Internet search
-0.006 Irritable temperament
-0.008 Everyday FB use
-0.008 Reading the news
-0.013 Sports – intensity
-0.018 Hyperthymic temperament
-0.022 Sports – days in a week
-0.029 Internet Use (in holiday)
In [36]:
print('False positive predictions.\n')
i = predictions['fp'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

show_prediction(
    predictions['model'],
    x_test.iloc[i,:],
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_test.columns.values.tolist(),
    top=len(x_test.columns)+1
)
False positive predictions.

Expected PIU value: 0.0.

Predicted PIU value: 1.0.

Out[36]:

y=PIU yes (probability 0.654) top features

Contribution? Feature
+0.502 <BIAS>
+0.098 Cyclothymic temperament
+0.079 Average time spent on FB
+0.069 Internet Use (hours per week)
+0.050 Irritable temperament
+0.021 Online games
+0.019 Arts and culture
+0.017 Travel/tourism
+0.016 Achievement
+0.014 Targeted Internet search
+0.011 Hyperthymic temperament
+0.011 Social networks
+0.009 Science
+0.008 Internet for school
+0.006 FB use - sharing music, photos etc.
+0.006 FB use – chatting
+0.006 Everyday FB use
+0.004 Communication by e-mail
+0.004 Education
+0.003 Anxiety temperament
+0.003 Music
+0.002 FB use - reading posts
+0.002 Downloading music and movies
+0.001 Communication on the blog
+0.001 Fast Food
-0.000 Internet Use (in years)
-0.001 FB use –gaming
-0.001 Business
-0.001 Health and medicine
-0.002 Drepressive temperament
-0.003 Expert Advice
-0.004 Politics
-0.004 Online courses
-0.005 Alcohol
-0.005 Sports
-0.006 Reading the news
-0.006 FB use - publishing statuses
-0.006 Computers and technology
-0.006 Pop culture
-0.008 Religion
-0.008 Economic status
-0.008 Communication on the forum
-0.009 Pornography
-0.011 Sports – in minutes
-0.012 Internet Use (in holiday)
-0.013 Search for favorite websites
-0.015 Sports – days in a week
-0.019 Internet Use (hours per day)
-0.021 Attitude about time on the Internet
-0.021 Smoker
-0.021 Energy drinks
-0.024 Sports – intensity
-0.026 Reading and downloading books and texts
-0.039 Surfing
In [37]:
print('False negative predictions.\n')
i = predictions['fn'][0][0]
print('Expected PIU value: {}.\n'.format(y_test[i]))
print('Predicted PIU value: {}.\n'.format(predictions['predicted_values']['Predicted'][i]))

show_prediction(
    predictions['model'],
    x_test.iloc[i,:],
    targets=[0, 1], 
    target_names=['PIU no', 'PIU yes'],
    feature_names=x_test.columns.values.tolist(),
    top=len(x_test.columns)+1
)
False negative predictions.

Expected PIU value: 1.0.

Predicted PIU value: 0.0.

Out[37]:

y=PIU no (probability 0.562) top features

Contribution? Feature
+0.498 <BIAS>
+0.073 Internet Use (hours per week)
+0.053 Social networks
+0.022 Internet Use (hours per day)
+0.020 Attitude about time on the Internet
+0.018 Online games
+0.016 Music
+0.015 Communication on the blog
+0.012 Computers and technology
+0.012 Reading the news
+0.008 Irritable temperament
+0.008 Hyperthymic temperament
+0.007 FB use - sharing music, photos etc.
+0.007 Online courses
+0.007 Reading and downloading books and texts
+0.007 Anxiety temperament
+0.006 FB use - reading posts
+0.006 Internet Use (in holiday)
+0.005 Communication by e-mail
+0.005 Science
+0.005 Search for favorite websites
+0.004 Internet Use (in years)
+0.003 FB use – visiting groups
+0.003 Pop culture
+0.003 Politics
+0.001 Travel/tourism
+0.001 Business
+0.000 Religion
+0.000 Sports – in minutes
-0.000 Internet for school
-0.000 Surfing
-0.000 Drepressive temperament
-0.001 FB use - publishing statuses
-0.002 Expert Advice
-0.002 Targeted Internet search
-0.003 Fast Food
-0.003 Sports – intensity
-0.004 Gender
-0.004 Economic status
-0.004 FB use – chatting
-0.004 Coffee
-0.004 Downloading music and movies
-0.005 Alcohol
-0.008 Education
-0.012 Communication on the forum
-0.013 Sports – days in a week
-0.015 Energy drinks
-0.016 Pornography
-0.016 Health and medicine
-0.017 Everyday FB use
-0.017 Achievement
-0.031 Arts and culture
-0.039 Average time spent on FB
-0.042 Cyclothymic temperament
In [38]:
# show statistics for features in top_n features of different methods
df_top_features = df_top_features.sort_values(by=['top_count_xai'],ascending=False)
print(df_top_features)
                                 attr_names  top_count  top_count_xai  \
31            Internet Use (hours per week)          6              3   
4       Attitude about time on the Internet          6              3   
5                  Average time spent on FB          6              3   
53                                  Surfing          4              3   
18                          Everyday FB use          6              3   
35                    Irritable temperament          4              3   
12                  Cyclothymic temperament          6              3   
30             Internet Use (hours per day)          4              2   
48                          Social networks          3              2   
38                             Online games          3              2   
2                       Anxiety temperament          3              1   
3                          Arts and culture          4              1   
17                            Energy drinks          1              1   
39                                 Politics          0              0   
40                              Pop culture          0              0   
36                                    Music          0              0   
37                           Online courses          0              0   
42  Reading and downloading books and texts          0              0   
34                      Internet for school          1              0   
41                              Pornography          1              0   
0                               Achievement          2              0   
43                         Reading the news          0              0   
44                                 Religion          0              0   
33                  Internet Use (in years)          0              0   
46             Search for favorite websites          0              0   
47                                   Smoker          3              0   
49                                   Sports          1              0   
50                 Sports – days in a  week          0              0   
51                      Sports – in minutes          0              0   
52                       Sports – intensity          0              0   
54                 Targeted Internet search          0              0   
45                                  Science          0              0   
28                      Health and medicine          0              0   
32                Internet Use (in holiday)          0              0   
16                                Education          0              0   
6                                  Business          0              0   
7                                    Coffee          0              0   
8                   Communication by e-mail          0              0   
9                 Communication on the blog          0              0   
10               Communication on the forum          0              0   
11                 Computers and technology          1              0   
13             Downloading music and movies          0              0   
14                  Drepressive temperament          0              0   
15                          Economic status          0              0   
19                            Expert Advice          0              0   
29                  Hyperthymic temperament          2              0   
20             FB use - publishing statuses          0              0   
21                   FB use - reading posts          0              0   
22      FB use - sharing music, photos etc.          0              0   
23                        FB use – chatting          0              0   
24                 FB use – visiting groups          0              0   
25                           FB use –gaming          0              0   
26                                Fast Food          0              0   
27                                   Gender          0              0   
1                                   Alcohol          0              0   
55                           Travel/tourism          1              0   

    top_count_selection  
31                    3  
4                     3  
5                     3  
53                    1  
18                    3  
35                    1  
12                    3  
30                    2  
48                    1  
38                    1  
2                     2  
3                     3  
17                    0  
39                    0  
40                    0  
36                    0  
37                    0  
42                    0  
34                    1  
41                    1  
0                     2  
43                    0  
44                    0  
33                    0  
46                    0  
47                    3  
49                    1  
50                    0  
51                    0  
52                    0  
54                    0  
45                    0  
28                    0  
32                    0  
16                    0  
6                     0  
7                     0  
8                     0  
9                     0  
10                    0  
11                    1  
13                    0  
14                    0  
15                    0  
19                    0  
29                    2  
20                    0  
21                    0  
22                    0  
23                    0  
24                    0  
25                    0  
26                    0  
27                    0  
1                     0  
55                    1  
In [39]:
df_top_features = df_top_features.sort_values(by=['top_count'],ascending=False)
print(df_top_features)
                                 attr_names  top_count  top_count_xai  \
31            Internet Use (hours per week)          6              3   
5                  Average time spent on FB          6              3   
18                          Everyday FB use          6              3   
12                  Cyclothymic temperament          6              3   
4       Attitude about time on the Internet          6              3   
3                          Arts and culture          4              1   
53                                  Surfing          4              3   
35                    Irritable temperament          4              3   
30             Internet Use (hours per day)          4              2   
47                                   Smoker          3              0   
38                             Online games          3              2   
2                       Anxiety temperament          3              1   
48                          Social networks          3              2   
29                  Hyperthymic temperament          2              0   
0                               Achievement          2              0   
11                 Computers and technology          1              0   
41                              Pornography          1              0   
49                                   Sports          1              0   
55                           Travel/tourism          1              0   
34                      Internet for school          1              0   
17                            Energy drinks          1              1   
9                 Communication on the blog          0              0   
39                                 Politics          0              0   
13             Downloading music and movies          0              0   
14                  Drepressive temperament          0              0   
15                          Economic status          0              0   
19                            Expert Advice          0              0   
20             FB use - publishing statuses          0              0   
21                   FB use - reading posts          0              0   
22      FB use - sharing music, photos etc.          0              0   
23                        FB use – chatting          0              0   
24                 FB use – visiting groups          0              0   
25                           FB use –gaming          0              0   
26                                Fast Food          0              0   
27                                   Gender          0              0   
1                                   Alcohol          0              0   
10               Communication on the forum          0              0   
8                   Communication by e-mail          0              0   
42  Reading and downloading books and texts          0              0   
7                                    Coffee          0              0   
43                         Reading the news          0              0   
44                                 Religion          0              0   
33                  Internet Use (in years)          0              0   
46             Search for favorite websites          0              0   
37                           Online courses          0              0   
36                                    Music          0              0   
50                 Sports – days in a  week          0              0   
40                              Pop culture          0              0   
52                       Sports – intensity          0              0   
54                 Targeted Internet search          0              0   
45                                  Science          0              0   
28                      Health and medicine          0              0   
32                Internet Use (in holiday)          0              0   
16                                Education          0              0   
6                                  Business          0              0   
51                      Sports – in minutes          0              0   

    top_count_selection  
31                    3  
5                     3  
18                    3  
12                    3  
4                     3  
3                     3  
53                    1  
35                    1  
30                    2  
47                    3  
38                    1  
2                     2  
48                    1  
29                    2  
0                     2  
11                    1  
41                    1  
49                    1  
55                    1  
34                    1  
17                    0  
9                     0  
39                    0  
13                    0  
14                    0  
15                    0  
19                    0  
20                    0  
21                    0  
22                    0  
23                    0  
24                    0  
25                    0  
26                    0  
27                    0  
1                     0  
10                    0  
8                     0  
42                    0  
7                     0  
43                    0  
44                    0  
33                    0  
46                    0  
37                    0  
36                    0  
50                    0  
40                    0  
52                    0  
54                    0  
45                    0  
28                    0  
32                    0  
16                    0  
6                     0  
51                    0  
In [41]:
df_top_features = df_top_features.sort_values(by=['top_count_selection'],ascending=False)
print(df_top_features)
                                 attr_names  top_count  top_count_xai  \
31            Internet Use (hours per week)          6              3   
18                          Everyday FB use          6              3   
12                  Cyclothymic temperament          6              3   
4       Attitude about time on the Internet          6              3   
3                          Arts and culture          4              1   
5                  Average time spent on FB          6              3   
47                                   Smoker          3              0   
30             Internet Use (hours per day)          4              2   
2                       Anxiety temperament          3              1   
29                  Hyperthymic temperament          2              0   
0                               Achievement          2              0   
11                 Computers and technology          1              0   
34                      Internet for school          1              0   
49                                   Sports          1              0   
41                              Pornography          1              0   
55                           Travel/tourism          1              0   
48                          Social networks          3              2   
53                                  Surfing          4              3   
35                    Irritable temperament          4              3   
38                             Online games          3              2   
43                         Reading the news          0              0   
44                                 Religion          0              0   
33                  Internet Use (in years)          0              0   
46             Search for favorite websites          0              0   
37                           Online courses          0              0   
36                                    Music          0              0   
50                 Sports – days in a  week          0              0   
52                       Sports – intensity          0              0   
40                              Pop culture          0              0   
54                 Targeted Internet search          0              0   
45                                  Science          0              0   
28                      Health and medicine          0              0   
32                Internet Use (in holiday)          0              0   
16                                Education          0              0   
6                                  Business          0              0   
42  Reading and downloading books and texts          0              0   
7                                    Coffee          0              0   
21                   FB use - reading posts          0              0   
8                   Communication by e-mail          0              0   
10               Communication on the forum          0              0   
17                            Energy drinks          1              1   
9                 Communication on the blog          0              0   
39                                 Politics          0              0   
13             Downloading music and movies          0              0   
14                  Drepressive temperament          0              0   
15                          Economic status          0              0   
19                            Expert Advice          0              0   
20             FB use - publishing statuses          0              0   
22      FB use - sharing music, photos etc.          0              0   
23                        FB use – chatting          0              0   
24                 FB use – visiting groups          0              0   
25                           FB use –gaming          0              0   
26                                Fast Food          0              0   
27                                   Gender          0              0   
1                                   Alcohol          0              0   
51                      Sports – in minutes          0              0   

    top_count_selection  
31                    3  
18                    3  
12                    3  
4                     3  
3                     3  
5                     3  
47                    3  
30                    2  
2                     2  
29                    2  
0                     2  
11                    1  
34                    1  
49                    1  
41                    1  
55                    1  
48                    1  
53                    1  
35                    1  
38                    1  
43                    0  
44                    0  
33                    0  
46                    0  
37                    0  
36                    0  
50                    0  
52                    0  
40                    0  
54                    0  
45                    0  
28                    0  
32                    0  
16                    0  
6                     0  
42                    0  
7                     0  
21                    0  
8                     0  
10                    0  
17                    0  
9                     0  
39                    0  
13                    0  
14                    0  
15                    0  
19                    0  
20                    0  
22                    0  
23                    0  
24                    0  
25                    0  
26                    0  
27                    0  
1                     0  
51                    0  
In [40]:
# calcualate statistics of important features for correctly classified instances
top_n_features = ['Internet Use (hours per week)', 
'Average time spent on FB',
'Cyclothymic temperament',
'Attitude about time on the Internet',
'Irritable temperament',
'Surfing',
'Everyday FB use',
'Internet Use (hours per day)',
'Social networks',
'Depressive temperament'
]

relevance.get_important_features_statistics(predictions, x_test, top_n_features)
                            attr_names  score
0        Internet Use (hours per week)     23
1             Average time spent on FB     22
2              Cyclothymic temperament     22
7         Internet Use (hours per day)     18
8                      Social networks     16
4                Irritable temperament     10
5                              Surfing     10
3  Attitude about time on the Internet      3
6                      Everyday FB use      2
9               Depressive temperament      0
Number of tp instances 25.00. Mean number of relevant features per classification result 5.04.
                            attr_names  score
0        Internet Use (hours per week)     34
7         Internet Use (hours per day)     33
1             Average time spent on FB     29
2              Cyclothymic temperament     25
4                Irritable temperament     24
8                      Social networks     24
3  Attitude about time on the Internet     21
5                              Surfing     19
6                      Everyday FB use     12
9               Depressive temperament      0
Number of tn instances 46.00. Mean number of relevant features per classification result 4.80.
                            attr_names  score
1             Average time spent on FB      8
0        Internet Use (hours per week)      7
2              Cyclothymic temperament      6
4                Irritable temperament      6
5                              Surfing      6
8                      Social networks      6
6                      Everyday FB use      2
7         Internet Use (hours per day)      2
3  Attitude about time on the Internet      1
9               Depressive temperament      0
Number of fp instances 9.00. Mean number of relevant features per classification result 4.89.
                            attr_names  score
0        Internet Use (hours per week)     13
7         Internet Use (hours per day)     10
1             Average time spent on FB      8
4                Irritable temperament      8
3  Attitude about time on the Internet      6
5                              Surfing      6
8                      Social networks      5
2              Cyclothymic temperament      4
6                      Everyday FB use      3
9               Depressive temperament      0
Number of fn instances 16.00. Mean number of relevant features per classification result 3.94.